Lab 4
library(ggplot2)
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Continous Distributions and Expectations
1. Uniform Distribution
- Make a large sample and plot a histogram.
This looks like a skyline where all skyscrapers have approximately the same height. :)
x.4 <- runif(1000)
hist(x.4)ggplot2 example:
https://dk81.github.io/dkmathstats_site/rmath-uniform-plots.html
- Also plot the empirical cdf.
This is close to a straight line.
plot.ecdf(x.4)
abline(a=0,b=1, col = 2)- Make two uniformly distributed samples(let’s say
U1andU2) and examine the distribution of the sum (Distribution ofU1 + U2).
Look at the histogram. Does it look like this is uniform?
x.5 <- runif(1000)
x.6 <- runif(1000)
x.7 = x.5 + x.6
hist(x.7)- Plot the empirical cdf. Predict what it will look like before plotting.
plot.ecdf(x.7)- Expected value of Uniform distribution
mean(runif(10000, min = 1, max = 6))## [1] 3.499009
Example 1: Conditional Distribution
x <- runif(1000)# make a sample from U(0,1)
y <- x[x<0.5] # keep only observations < 0.5
z<- x[x>= 0.5] # keep only observations >= 0.5
head(x)## [1] 0.27200895 0.07662424 0.64003796 0.14659809 0.91051626 0.18622363
head(y)## [1] 0.27200895 0.07662424 0.14659809 0.18622363 0.44014818 0.18413936
head(z)## [1] 0.6400380 0.9105163 0.6620075 0.6897932 0.8234334 0.8623052
plot.ecdf(x)plot.ecdf(y)plot.ecdf(z)The result is essentially a straight line (uniform distribution).
# Create a sequence of numbers between -10 and 10 incrementing by 0.1.
x <- seq(-1, 2, by = .1)
# Choose the mean as 2.5 and standard deviation as 0.5.
y <- dunif(x,0,1)
plot(x,y,col="blue", main ="Density Plot of the Uniform Distribution", type="l")# Create a sequence of numbers between -1 and 2 incrementing by 0.2.
x <- seq(-1, 2, by = .1)
# Choose the mean as 2.5 and standard deviation as 2.
y <- punif(x, 0,1)
# Plot the graph.
plot(x,y,type="l", col="red", main ="CDF of the Uniform Distribution" )2. Exponential Distibution
- Density plot
# Initialize some values.
x_lower <- 0
x_upper <- 5
max_height <- max(dexp(x_lower:x_upper, rate = 1, log = FALSE)) #exponential distribution’s density function
max_height## [1] 1
ggplot(data.frame(x = c(x_lower, x_upper)), aes(x = x)) + xlim(x_lower, x_upper) +
ylim(0, max_height) +
stat_function(fun = dexp, args = list(rate = 1), geom = "area", fill = "blue", alpha = 0.25) +
stat_function(fun = dexp, args = list(rate = 1)) +
labs(x = "\n x", y = "f(x) \n", title = "Exponential Distribution Density Plot With Rate = 1 \n") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="blue", size = 12),
axis.title.y = element_text(face="bold", colour="blue", size = 12))#Reference:https://dk81.github.io/dkmathstats_site/rvisual-cont-prob-dists.html- Multiple Exponential Distribution Plots:
par(mfrow=c(2,2))
hist(rexp(10))
hist(rexp(100))
hist(rexp(1000))
hist(rexp(10000))library(ggplot2)
x_lower <- 0
x_upper <- 5
max_height2 <- max(dexp(x_lower:x_upper, rate = 1, log = FALSE), dexp(x_lower:x_upper, rate = 2, log = FALSE),
dexp(x_lower:x_upper, rate = 0.5, log = FALSE))
max_height2## [1] 2
ggplot(data.frame(x = c(x_lower, x_upper)), aes(x = x)) + xlim(x_lower, x_upper) +
ylim(0, max_height2) +
stat_function(fun = dexp, args = list(rate = 0.5), aes(colour = "0.5")) +
stat_function(fun = dexp, args = list(rate = 1), aes(colour = "1")) +
stat_function(fun = dexp, args = list(rate = 2), aes(colour = "2")) +
scale_color_manual("Rate", values = c("blue", "green", "red")) +
labs(x = "\n x", y = "f(x) \n",
title = "Exponential Distribution Density Plots \n") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="blue", size = 12),
axis.title.y = element_text(face="bold", colour="blue", size = 12),
legend.title = element_text(face="bold", size = 10),
legend.position = "top")# References:
# https://stackoverflow.com/questions/31792634/adding-legend-to-ggplot2-with-multiple-lines-on-plot
# https://stackoverflow.com/questions/19950219/using-legend-with-stat-function-in-ggplot2Example 2: Memoryless Property of Exponential Distribution
Consider \(X ~ exponential\), say with rate = 2. What is the distribution of \(Y = X|X > 2\) ? What is the distribution of \(Y - 2\)?
x <- rexp(10000,2)
y <- x[x>2]
cdf.x <- ecdf(x)
cdf.y <- ecdf(y)
cdf.y.2 <- ecdf(y-2)
t <- seq(0,5,by = .01)
plot(t,cdf.x(t), type = 'l', lwd = 2)
lines(t,cdf.y(t), col = 2)
lines(t,cdf.y.2(t), col = 4)
legend(c("X", "Y = X|X>2", "Y-2"), fill = c(1,2,4), x = 'bottomright')3. Normal Distribution
- Generating a normal random variable.
By default, rnorm() creates standard normal random
variables with a mean of 0 and a standard deviation of 1. However, the
mean and standard deviation can be altered using the mean and sd
arguments.
y=rnorm(100)
mean(y)## [1] -0.1555599
var(y)## [1] 0.9829083
sqrt(var(y))## [1] 0.9914173
sd(y)## [1] 0.9914173
- Density plot
ggplot(data.frame(x = c(-4, 4)), aes(x = x)) +
stat_function(fun = dnorm)# Histogram overlaid with kernel density curve
ggplot(data.frame(x = rnorm(1000)), aes(x = x)) +
geom_histogram(aes(y=..density..), # Histogram with density instead of count on y-axis
binwidth=.5,
colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") # Overlay with transparent density plot- CDF of the Normal Distribution
# Create a sequence of numbers between -10 and 10 incrementing by 0.2.
x <- seq(-10,10,by = .2)
# Choose the mean as 2.5 and standard deviation as 2.
y <- pnorm(x, mean = 2.5, sd = 2)
# Plot the graph.
plot(x,y,type="l", col="red", main ="CDF of the Normal Distribution" )- Density of the Normal Distribution
# Create a sequence of numbers between -10 and 10 increment by 0.1.
x <- seq(-10, 10, by = .1)
# Choose the mean as 2.5 and standard deviation as 0.5.
y <- dnorm(x, mean = 2.5, sd = 0.5)
plot(x,y,col="blue", main ="Density of the Normal Distribution")- Quantile Function of the Normal Distribution
# Create a sequence of probability values incrementing by 0.02.
x <- seq(0, 1, by = 0.02)
# Choose the mean as 2 and standard deviation as 3.
y <- qnorm(x, mean = 2, sd = 1)
# Plot the graph.
plot(x,y,col="green", type="l", main ="Quantile Function of the Normal Distribution")- Summation of normal distributions
N1=rnorm(1000)
N2=rnorm(1000,mean=50,sd=.1)
Nsum=N1+N2 #summation of normal distributions
mean(Nsum)## [1] 50.0467
mean(N1)+mean(N2)## [1] 50.0467
var(Nsum)## [1] 1.010747
var(N1)+var(N2)## [1] 1.008757
set.seed(1)
hist_data <- data.frame(Nsum)
colnames(hist_data) = c('x')
gg <- ggplot(hist_data,aes(x = x, color = 'density')) +
geom_histogram(aes(y = ..density..), bins = 7, fill = '#67B7D1', alpha = 0.5) +
geom_density(color = '#67B7D1') +
ylab("") +
xlab("") + theme(legend.title=element_blank()) +
ggtitle("Summation of Normal Distributions")+
scale_color_manual(values = c('density' = '#67B7D1'))
ggplotly(gg)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
title='N1+N2'),
yaxis = list(
title='pdf')) You can see that the sum follows a normal distribution. confirming it with a Q-Q plot below.
df <- data.frame(y = Nsum)
p <- ggplot(df, aes(sample = y))
p + stat_qq() + stat_qq_line()+
ggtitle ("Q-Q plot of the Summation of two Normal Distributions")#check your Directory file for this graph file
pdf("Figure.pdf")
df <- data.frame(y = Nsum)
p <- ggplot(df, aes(sample = y))
p + stat_qq(color='blue') + stat_qq_line()+
ggtitle ("Q-Q plot of the Summation of two Normal Distributions")
dev.off()## quartz_off_screen
## 2
- Correlation of random normal distributions.
x=rnorm(100)
y=rnorm(100)
plot(x,y,xlab="this is the x-axis",ylab="this is the y-axis", main="Plot of X vs Y")Example 3: Shapiro Wilk Test
iris data set gives the measurements in centimeters of
the variables sepal length, sepal width, petal length and petal width,
respectively, for 50 flowers from each of 3 species of iris. The species
are Iris setosa, versicolor, and virginica.
my_data <- iris
head(my_data)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
- Density plot:
the density plot provides a visual judgment about whether the distribution is bell shaped.
library("ggpubr")
ggdensity(my_data$Sepal.Length,
main = "Density plot of Sepal.Length",
xlab = "Sepal lenngth")Looks like multi-model.
my_data_S=my_data[my_data$Species=="setosa",]
head(my_data_S)## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
ggdensity(my_data_S$Sepal.Length,
main = "Density plot of Sepal.Length for Setosa",
xlab = "Sepal lenngth")- Q-Q plot:
Q-Q plot (or quantile-quantile plot) draws the correlation between a given sample and the normal distribution. A 45-degree reference line is also plotted.
library(ggpubr)
ggqqplot(my_data_S$Sepal.Length)It’s also possible to use the function qqPlot() [in car package]:
library("car")## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
qqPlot(my_data_S$Sepal.Length)## [1] 15 14
Example 4: shapiro.test()
The R function shapiro.test() can be used to perform the Shapiro-Wilk test of normality for one variable (univariate):
shapiro.test(my_data_S$Sepal.Length)##
## Shapiro-Wilk normality test
##
## data: my_data_S$Sepal.Length
## W = 0.9777, p-value = 0.4595
From the output, the p-value > 0.05 implying that the distribution of the data are not significantly different from normal distribution (Rejection rule is if p-value < 0.05 we reject the null hypothesis ).
Here, The null hypothesis of these tests is that “sample distribution is normal”. If the test is significant, the distribution is non-normal.
In other words, we can assume the normality.
For double Checking
shapiro.test(rnorm(1000, mean = 15, sd = 2))##
## Shapiro-Wilk normality test
##
## data: rnorm(1000, mean = 15, sd = 2)
## W = 0.99881, p-value = 0.7609
Example 5: Anderson Darling Test
library(nortest)
ad.test(rnorm(1000, mean = 5, sd = 3)) #normal##
## Anderson-Darling normality test
##
## data: rnorm(1000, mean = 5, sd = 3)
## A = 0.41598, p-value = 0.332
ad.test(runif(1000, min = 2, max = 4)) #not normal##
## Anderson-Darling normality test
##
## data: runif(1000, min = 2, max = 4)
## A = 10.015, p-value < 2.2e-16
Example 6: Kolmogorov-Smirnov Test
x= rnorm(1000, mean = 5, sd = 3)
y=runif(1000, min = 2, max = 4)
# Does x come from a normal(5,3) distribution
ks.test(x,"pnorm",5,3)##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.038343, p-value = 0.1057
## alternative hypothesis: two-sided
# Does y come from a normal(5,3) distribution
ks.test(y,"pnorm",5,3)##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: y
## D = 0.63124, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Do x and y come from the same distribution?
ks.test(x, y)##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.656, p-value < 2.2e-16
## alternative hypothesis: two-sided
4. Gamma Distribution
library(ggplot2)
x_lower <- 0
x_upper <- 5
max_height2 <- max(dgamma(x_lower:x_upper, shape = 1, scale = 1), dgamma(x_lower:x_upper, shape = 2, scale = 1),
dgamma(x_lower:x_upper, shape = 2, scale = 3))
ggplot(data.frame(x = c(x_lower, x_upper)), aes(x = x)) + xlim(x_lower, x_upper) +
ylim(0, max_height2) + #g.1
stat_function(fun = dgamma, args = list(shape = 1, scale = 1), aes(colour = "g.1")) +
stat_function(fun = dgamma, args = list(shape = 2, scale = 1), aes(colour = "g.2")) + #g.2
stat_function(fun = dgamma, args = list(shape = 2, scale = 3), aes(colour = "g.3")) + #g.3
scale_color_manual("Distributions", values = c("blue", "green", "red")) +
labs(x = "\n x", y = "f(x) \n",
title = "Gamma Distribution Density Plots \n") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="blue", size = 12),
axis.title.y = element_text(face="bold", colour="blue", size = 12),
legend.title = element_text(face="bold", size = 10),
legend.position = "top")Eample 7:
On average, someone sends a money order once per 15 minutes. What is the probability someone sends 10 money orders in less than 3 hours?
\(X\sim Gamma(alpha=10,beta=60/15)\) \(P(X<3)?\)
alpha = 10
beta = 15 / 60 #in R we need to put 1/beta
x = 3
# exact
pgamma(q = x, shape = alpha, scale = beta)## [1] 0.7576078
# simulated
mean(rgamma(n = 10000, shape = alpha, scale = beta) <= x)## [1] 0.7497
5. t-distribution
- Find the 2.5th and 97.5th percentiles of the Student t distribution with 5 degrees of freedom.
qt(c(.025, .975), df=5) # 5 degrees of freedom ## [1] -2.570582 2.570582
x=rt(1000,5)
quantile(x,c(0.025,0.975))## 2.5% 97.5%
## -2.546692 2.605439
x=rt(100000,5)
quantile(x,c(0.025,0.975)) #much closer## 2.5% 97.5%
## -2.576245 2.542914
- Plotting the t-distribution
# Creating the vector with the x-values for dt function
x_dt <- seq(- 5, 5, by = 0.01)
# Applying the dt() function
y_dt <- dt(x_dt, df = 3)
# Plotting
plot(y_dt, type = "l", main = "t-distribution density function example", las=1)6. Cauchy Distribution
Make a sample from a Cauchy distribution and compute the sample mean and median.
z<- rcauchy(100)
mean(z)## [1] -0.8805818
summary(z)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33.2775 -0.7370 0.0000 -0.8806 0.7786 22.7616
Make many means and medians.
x.1 <- replicate(1000,mean(rcauchy(10000)))
x.2 <- replicate(1000, median(rcauchy(10000)))
summary(x.1)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -412.1741 -1.0458 -0.0313 0.0531 0.8933 314.7184
summary(x.2)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.0551049 -0.0106348 -0.0009373 -0.0008693 0.0095939 0.0483039
Maybe the sample size is too small for computing the mean ?
Plot the ecdf of a suitable sample.
plot.ecdf(rcauchy(1000))Standard deviation of a Cauchy rv DOES NOT EXIST
boxplot(replicate(1000, sd(rcauchy(100))))Extra Examples:
Example 8: Conditional Distribution
Let \(X.0 \sim U(0,3)\), \(X.1 \sim Poisson(4)\), and \(X = X.0 + X.1\)
Make an empirical cdf of X
Make an empirical cdf of \(X|X < 8\) in the same plot
Make an emoirical cdf of \(X.1|X<8\)
N = 10000
X.0 <- runif(N,0,3)
X.1 <- rpois(N,4)
X <- X.0 + X.1
plot.ecdf(X)plot.ecdf(X)
X.2 <- X[X<8]
mycdf = ecdf(X.2)
t <- seq(0,10,by = .01)
lines(t,mycdf(t), col = 2)plot.ecdf(X.1[X<8])